Setup

# Setup file locations to import 
beer_csv_loc <- "./StartingDBs/Beers.csv"
breweries_csv_loc <- "./StartingDBs/Breweries.csv"

# Read in data 
beer_data <- read.csv(beer_csv_loc, header = TRUE)
brewery_data <- read.csv(breweries_csv_loc, header = TRUE)

# Data provided by Celia
# Celia's contributions prefied by cb_
cb_beer <- read.xlsx(file="./StartingDBs/Beers_Celia.xlsx", sheetIndex = 1, 
                     header=TRUE, stringsAsFactors = FALSE)
cb_brewery <- read.xlsx(file="./StartingDBs/Breweries_Celia.xlsx", sheetIndex = 1, 
                     header=TRUE, stringsAsFactors = FALSE)

Case study Questions

1. How many breweries are present in each state?

# Create a seperate data frame to store state data 
### Note: This data set counts the district of columbia as a state
num_of_breweries_by_state <- data.frame(table(brewery_data$State))

##### Celia Banks #####
#Begin prepare an interactive geo map of breweries by state
#set the map
leaflet() %>% addTiles()
#add popup for brewery information
brewery_map <- cb_brewery %>% mutate(popup_info=paste(Name,"<br/>",City,"<br>",State,"<br>"))
#tweak the map with info layout, set circles smaller to enhance visual
leaflet() %>% addTiles() %>% addCircleMarkers(data=brewery_map, 
                                              lat=~Latitude, 
                                              lng=~Longitude, 
                                              radius=~1, 
                                              popup=~popup_info)
#End prepare an interactive geo map of breweries by state#

#corresponding distribution plot of breweries by state
ggplot(brewery_map, aes(x=as.factor(State), color="blue")) +
       geom_bar(color="blue") +
       ggtitle("Distribution of Breweries by State") +
       xlab("State") +
       geom_text(stat='count', aes(label=after_stat(count)), vjust=-.25) +
       theme(legend.position="none", axis.text.x = element_text(angle = 90)) +
       theme(plot.title = element_text(hjust = 0.5))

#corresponding distribution plot of breweries by region
ggplot(brewery_map, aes(x=as.factor(Region), color="green")) +
       geom_bar(color="green") +
       ggtitle("Distribution of Breweries by Region") +
       xlab("Region") +
       geom_text(stat='count', aes(label=after_stat(count)), vjust=-.25) +
       theme(legend.position="none", axis.text.x = element_text(angle = 90)) +
       theme(plot.title = element_text(hjust = 0.5))

##### END CB #####

2. Merge beer data with the breweries data. Print the first 6 observations and

#the last six observations to check the merged file

# Change column name of beer_data to match brewery_data to use as a primary key
colnames(beer_data)[5] <- "Brew_ID"

# Merge two data bases using merge
full_brew_data <- merge(brewery_data, beer_data, by="Brew_ID")

# Rename columns 2 and 5 that were changed during the merge
colnames(full_brew_data)[2] <- "Brewery"
colnames(full_brew_data)[5] <- "Beer_Name"

# Print first 6 and last 6 observations 
head(full_brew_data, n=6)
##   Brew_ID            Brewery        City State     Beer_Name Beer_ID   ABV IBU
## 1       1 NorthGate Brewing  Minneapolis    MN       Pumpion    2689 0.060  38
## 2       1 NorthGate Brewing  Minneapolis    MN    Stronghold    2688 0.060  25
## 3       1 NorthGate Brewing  Minneapolis    MN   Parapet ESB    2687 0.056  47
## 4       1 NorthGate Brewing  Minneapolis    MN  Get Together    2692 0.045  50
## 5       1 NorthGate Brewing  Minneapolis    MN Maggie's Leap    2691 0.049  26
## 6       1 NorthGate Brewing  Minneapolis    MN    Wall's End    2690 0.048  19
##                                 Style Ounces
## 1                         Pumpkin Ale     16
## 2                     American Porter     16
## 3 Extra Special / Strong Bitter (ESB)     16
## 4                        American IPA     16
## 5                  Milk / Sweet Stout     16
## 6                   English Brown Ale     16
tail(full_brew_data, n=6)
##      Brew_ID                       Brewery          City State
## 2405     556         Ukiah Brewing Company         Ukiah    CA
## 2406     557       Butternuts Beer and Ale Garrattsville    NY
## 2407     557       Butternuts Beer and Ale Garrattsville    NY
## 2408     557       Butternuts Beer and Ale Garrattsville    NY
## 2409     557       Butternuts Beer and Ale Garrattsville    NY
## 2410     558 Sleeping Lady Brewing Company     Anchorage    AK
##                      Beer_Name Beer_ID   ABV IBU                   Style Ounces
## 2405             Pilsner Ukiah      98 0.055  NA         German Pilsener     12
## 2406         Porkslap Pale Ale      49 0.043  NA American Pale Ale (APA)     12
## 2407           Snapperhead IPA      51 0.068  NA            American IPA     12
## 2408         Moo Thunder Stout      50 0.049  NA      Milk / Sweet Stout     12
## 2409  Heinnieweisse Weissebier      52 0.049  NA              Hefeweizen     12
## 2410 Urban Wilderness Pale Ale      30 0.049  NA        English Pale Ale     12
# Create CSV Files for head and tail 
head_output_csv <- "./output/head_beer_data.csv"
tail_output_csv <- "./output/tail_been_data.csv"
write.csv(head(full_brew_data, n=6), head_output_csv)
write.csv(tail(full_brew_data, n=6), tail_output_csv)

3. Address any values

# Create two new tables where one has only ABV with no <NA>'s and the other 
state_abv <- data.frame(State=full_brew_data$State, ABV=full_brew_data$ABV)
state_abv <- state_abv %>% drop_na(ABV)

# Do the same for IBU 
state_ibu <- data.frame(State=full_brew_data$State, IBU=full_brew_data$IBU)
state_ibu <- state_ibu %>% drop_na(IBU)

4. Compute the median alcohol content and international bitterness unit for each

##state. Plot a bar chart

# Find the average ABV for the state_abv table 
avg_state_abv <- ddply(state_abv, .(State), function(x) median(x$ABV))
colnames(avg_state_abv)[2] <- "Average_ABV" # Rename the column
# Drop DC. It's not a state....yet
avg_state_abv <- avg_state_abv[-c(8),] # DC is equal to position 8 on the table 
# Order the states by Decending ABV content 
avg_state_abv <- avg_state_abv[order(avg_state_abv$Average_ABV, decreasing=TRUE),]

# Find Average IBU
avg_state_ibu <- ddply(state_ibu, .(State), function(x) median(x$IBU))
colnames(avg_state_ibu)[2] <- "Average_IBU" # Rename column
# Order the states by IBU Decending 
avg_state_ibu <- avg_state_ibu[order(avg_state_ibu$Average_IBU, decreasing=TRUE),]

# Plot Average Alcohol by Volume per state
ggplot(avg_state_abv, aes(x=reorder(State, desc(Average_ABV)), y=Average_ABV, fill=State)) + 
       geom_col(show.legend = FALSE, width=.9, position="dodge") +
       ggtitle("Average ABV per State") +
       xlab("State") + 
       ylab("Average Alcohol by Volume") 

# Plot Average International Bitterness Unit per State 
ggplot(avg_state_ibu, aes(x=reorder(State, desc(Average_IBU)), y=Average_IBU, fill=State)) + 
       geom_col(show.legend = FALSE, width=.9, position="dodge") +
       ggtitle("Average IBU per State") + 
       xlab("State") + 
       ylab("Average IBU")

4.1 Additional data provided by Celia

##### Celia Code #####
merge_bb <- merge(cb_beer, cb_brewery, by.x="Brewery_id", by.y="Brew_ID")
#Drop latitude and longitude columns as no longer needed
drop <- c("Latitude","Longitude")

cb_merged_bb <- merge_bb[,!(names(merge_bb) %in% drop)]

#rename columns to avoid confusion
#cb_merged_bb <- cb_merged_bb %>% rename(Beer_Name = Name.x, Brewery_Name = Name.y)
### RAH ###
colnames(cb_merged_bb)[2] <- "Beer_Name"
colnames(cb_merged_bb)[8] <- "Brewery_Name"
### End RAH ###

#get count of breweries in each state
brewery_count <- aggregate(Brewery_id ~ State, cb_merged_bb, sum)
aggregate(cb_merged_bb$Brewery_id, by=list(Category=cb_merged_bb$State), FUN=sum)
##    Category     x
## 1        AK  8363
## 2        AL  4208
## 3        AR   820
## 4        AZ  9815
## 5        CA 46773
## 6        CO 65472
## 7        CT  9222
## 8        DC  1824
## 9        DE   857
## 10       FL 15781
## 11       GA  4662
## 12       HI  8521
## 13       IA 11756
## 14       ID  8605
## 15       IL 15231
## 16       IN 12855
## 17       KS  2899
## 18       KY  1334
## 19       LA  4817
## 20       MA 21195
## 21       MD  4789
## 22       ME  8142
## 23       MI 15130
## 24       MN  8687
## 25       MO 10063
## 26       MS  1698
## 27       MT 16604
## 28       NC 20439
## 29       ND  1008
## 30       NE  8855
## 31       NH  1508
## 32       NJ  1892
## 33       NM  5103
## 34       NV  4653
## 35       NY 19400
## 36       OH  7974
## 37       OK  5483
## 38       OR 26890
## 39       PA 26078
## 40       RI  5802
## 41       SC  3108
## 42       SD  1491
## 43       TN  2073
## 44       TX 25427
## 45       UT  7289
## 46       VA 11011
## 47       VT  6459
## 48       WA 25471
## 49       WI 18638
## 50       WV   314
## 51       WY  4438
#impute the rows with zeroes
cb_merged_bb$ABV[is.na(cb_merged_bb$ABV)] <- 0
cb_merged_bb$IBU[is.na(cb_merged_bb$IBU)] <- 0

#compute median ABV and IBU for each state
cb_medianABV <- aggregate(ABV ~ State + Region, cb_merged_bb, median)
cb_medianIBU <- aggregate(IBU ~ State + Region, cb_merged_bb, median)

#merge the medians dataframes
cb_mergeABVIBU <- merge(cb_medianABV, cb_medianIBU, by="State")
drop <- c("Region.y")
cb_mergeABVIBU = merge_bb[,!(names(merge_bb) %in% drop)]

#Visualize ABV v IBU medians by state and by region
options(scipen = 999)
cb_mergeABVIBU <- as.data.frame(cb_mergeABVIBU)

#by state
ggplot(cb_mergeABVIBU, aes(ABV, IBU, color=State)) +
       geom_point(show.legend = FALSE) +
       facet_wrap(vars(State)) 
## Warning: Removed 1005 rows containing missing values (geom_point).

#by region
ggplot(cb_mergeABVIBU, aes(ABV, IBU, color=Region)) +
       geom_point() +
       facet_wrap(vars(Region))
## Warning: Removed 1005 rows containing missing values (geom_point).

##### END #####

5. Which state has the the Max ABV. Which has the

# Get the State with the highest average ABV
state_max_abv <- avg_state_abv[which.max(avg_state_abv$Average_ABV),] # NV
# Get the State with the highest average IBU
state_max_ibu <- avg_state_ibu[which.max(avg_state_ibu$Average_IBU),] # WV

Nevada has the highest percentage of alcohal by volume while West Virginia has the highest IBU.

6. Comment on the Summary stats and distribution of the ABV variable

summary(state_abv)
##     State                ABV         
##  Length:2348        Min.   :0.00100  
##  Class :character   1st Qu.:0.05000  
##  Mode  :character   Median :0.05600  
##                     Mean   :0.05977  
##                     3rd Qu.:0.06700  
##                     Max.   :0.12800
ggplot(state_abv, aes(x=ABV)) + geom_histogram() # Right Skewness
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(state_abv, aes(x=ABV)) + geom_boxplot()

##7. Is there an apparent relationship between the bitterness of the beer and its alcoholic content?

# Create a new data frame from the beer data that removes rows that have <NA> in 
# the ABV and IBU columns
beer_data_no_na <- beer_data
beer_data_no_na <- beer_data_no_na %>% drop_na(ABV)
beer_data_no_na <- beer_data_no_na %>% drop_na(IBU)

# Scatter plot to show raw data 
ggplot(beer_data_no_na, aes(x=ABV, y=IBU, colour=Style)) + 
       geom_point(show.legend = FALSE) + 
       ggtitle("ABV vs IBU") +
       xlab("ABV in %") + 
       ylab("IBU")

# Plot a smooth curve to see a more "linear" pattern 
ggplot(beer_data_no_na, aes(x=ABV, y=IBU)) + 
       geom_smooth() +
       ggtitle("ABV vs IBU") + 
       xlab("ABV in %") + 
       ylab("IBU")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

8. investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales)

#and other types of Ale. Use KNN.